library(tidyverse)
library(lubridate)
library(caret)
library(nnet)
library(plotly)
library(randomForest)
First, we will import the raw data, clean them, and merge them according to our analysis needs. Then, we will take the monthly average to incorporate other data.
# function to clean and wrangle AIMS Moore Reef data to deal with column names and column placement
clean_MooreReef_data <- function(df) {
df[,2:ncol(df)] <- df[,1:ncol(df)]
df[,1] <- rownames(df)
df <- df %>% select(date, colnames(df)[ncol(df)])
return(df)
}
moore_reef_water_temp_2.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to16Feb2020_2.0m.csv", skip=108, sep= ",", row.names = NULL)
moore_reef_water_temp_9.0m <- read.csv("raw_data/temp/AIMS_MooreReef_WaterTemperature_22Oct1997to17Dec2017_9.0m.csv", skip=94, sep= ",", row.names = NULL)
# run through function defined above to clean and wrangle data
moore_reef_water_temp_2.0m <- clean_MooreReef_data(moore_reef_water_temp_2.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_9.0m <- clean_MooreReef_data(moore_reef_water_temp_9.0m)
## Warning in `[<-.data.frame`(`*tmp*`, , 2:ncol(df), value = structure(list(:
## provided 6 variables to replace 5 variables
moore_reef_water_temp_2.0m <- moore_reef_water_temp_2.0m %>%
filter(Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG != "Not available")
moore_reef_water_temp_9.0m <- moore_reef_water_temp_9.0m %>%
filter(Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG != "Not available")
# merge AIMS Moore reef temperature data
aims_temp_data <- Reduce(function(x,y) merge(x,y, by="date"), list(moore_reef_water_temp_2.0m,
moore_reef_water_temp_9.0m))
# convert water temp data from string to numeric type
aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG <-
as.numeric(as.character(aims_temp_data$Water.Temperature..2.0m.MORFL1.Reef.Flat.Site.1_LEVEL2_value_AVG))
aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG <-
as.numeric(as.character(aims_temp_data$Water.Temperature..9.0m.MORSL1.Reef.Slope.Site.1_LEVEL2_value_AVG))
# convert to date column into date type
aims_temp_data$date <- as.Date(aims_temp_data$date)
colnames(aims_temp_data) <- c("date", "avg_water_temp_2.0m_flat_site", "avg_water_temp_9.0m_slope_site")
# write combined AIMS Moore reef data
write.csv(aims_temp_data, "cleaned_data/aims_temperatures.csv", row.names = FALSE)
# convert date-decimal in coral cover to YYYY-MM-DD format
coral_cover <- read.csv("raw_data/trendgbr-coral-cover-with-ci.csv")
coral_cover$Date <- as.Date(format(date_decimal(coral_cover$Date), "%Y-%m-%d"))
# rename "Date" to "date"
names(coral_cover)[names(coral_cover) == "Date"] <- "date"
colnames(coral_cover) <- c("date", "mean_live_coral_cover_percent", "lower_conf_int", "upper_conf_int", "conf_int_span")
write.csv(coral_cover, "cleaned_data/coral_cover.csv", row.names = FALSE)
fish_census <- read.csv("raw_data/Fish census 1992-2015.csv", sep="\t", header=T)
fish_census <- fish_census %>% select(gbifID, class, family, genus, species, verbatimScientificName, decimalLatitude, decimalLongitude, dateIdentified)
fish_census$dateIdentified <- ymd_hms(fish_census$dateIdentified)
Group fish census by date, then count how many unique fish species were observed on a given date
fish_species_counts <- fish_census %>%
arrange(dateIdentified) %>%
group_by(dateIdentified) %>%
summarise(num_of_species=n_distinct(species))
## `summarise()` ungrouping output (override with `.groups` argument)
# rename "dateIdentified" to "date"
names(fish_species_counts)[names(fish_species_counts) == "dateIdentified"] <- "date"
fish_species_counts$date <- as.Date(fish_species_counts$date)
write.csv(fish_species_counts, "cleaned_data/fish_species_counts.csv", row.names = FALSE)
We would like to see whether we can classify sea grass species based on the survey location (latitude, longitude, and depth below sea level), and the type of sediment and seabed in which they were discovered.
We decided not to use sea grass species belonging to the Halophila genus because they are so widespread and common in tropical waters. Since the halophila genus were also present in nearly all the survey sites where other sea grass species were found, we determined that it did not make much sense from a scientific perspective, as well as from the data set we were given.
seagrass_data <- read.csv("raw_data/GBR_NESP-TWQ-3.2.1-5.4_JCU_Seagrass_1984-2018_Site-surveys.csv") %>%
# filter out for rows where we have information
filter(SEDIMENT != "Not recorded" & PRESENCE_A == "Present") %>%
# delete columns we are not going to use
select(-FID, -MONTH, -YEAR,
-SURVEY_MET, -SURVEY_NAM,
# remove seagrass belonging to halophila genus
-H_CAPRICOR, -H_TRICOSTA, -H_OVALIS, -H_UNINERVI, -H_DECIPIEN, -H_SPINULOS)
Let’s examine how many presence/absence we see for each sea grass species:
table(seagrass_data$C_ROTUNDAT)
##
## No Yes
## 30366 187
table(seagrass_data$C_SERRULAT)
##
## No Yes
## 28899 1654
table(seagrass_data$E_ACOROIDE)
##
## No Yes
## 30494 59
table(seagrass_data$S_ISOETIFO)
##
## No Yes
## 30353 200
table(seagrass_data$T_CILIATUM)
##
## No
## 30553
table(seagrass_data$T_HEMPRICH)
##
## No Yes
## 29674 879
table(seagrass_data$Z_CAPRICOR)
##
## No Yes
## 20074 10479
We see that there is actually no data at all for presence of T_CILIATUM. In addition, there are only 59 observations for E_ACOROIDE and 187 observations for C_ROTUNDAT that are actually useful in classifying these two species. Because we have a rather large data set, we want 200 or more observations for our classification model.So, we remove these columns and from our classification problem due to insufficient data.
seagrass_data <- seagrass_data %>% select(-C_ROTUNDAT, -T_CILIATUM, -E_ACOROIDE)
Since we deleted some columns, there may be some observations where all the remaining species columns have “No” values for absence. So, we filter out for rows where presence of at least one species was observed.
seagrass_data <- seagrass_data %>%
filter(C_SERRULAT=="Yes" |
S_ISOETIFO=="Yes" |
T_HEMPRICH=="Yes" |
Z_CAPRICOR=="Yes")
Now we want to count how many species were found at each location site.
count_species_present <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
count = 0
if (C_SERRULAT=="Yes") {
count <- count + 1
}
if (S_ISOETIFO=="Yes") {
count <- count + 1
}
if (T_HEMPRICH=="Yes") {
count <- count + 1
}
if (Z_CAPRICOR=="Yes") {
count <- count + 1
}
return(count)
}
seagrass_data$num_species_present <- mapply(count_species_present,
seagrass_data$C_SERRULAT,
seagrass_data$S_ISOETIFO,
seagrass_data$T_HEMPRICH,
seagrass_data$Z_CAPRICOR)
How many survey sites had more than 1 species discovered?
table(seagrass_data$num_species_present)
##
## 1 2 3
## 12381 411 3
nrow(seagrass_data)
## [1] 12795
We see that only 3.2% of total survey sites recorded more than 1 species observed. For ease of building a classification model, we will remove these rows. We do not want a situation where our model cannot “decide” in classifying our observations. Because we removed only about 3% of over 12,500 observations, we still preserve statistical power.
seagrass_data <- seagrass_data %>% filter(num_species_present < 2)
table(seagrass_data$C_SERRULAT)
##
## No Yes
## 11125 1256
table(seagrass_data$S_ISOETIFO)
##
## No Yes
## 12280 101
table(seagrass_data$T_HEMPRICH)
##
## No Yes
## 11558 823
table(seagrass_data$Z_CAPRICOR)
##
## No Yes
## 2180 10201
So, after some data wrangling and exploratory analysis, we will build a model to classify 4 species of seagrass: Cymodocea Serrulata , Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni). Now, let’s build a SPECIES column that collects all the presence in a single variable.
# function to make a species column
get_species_type <- function(C_SERRULAT, S_ISOETIFO, T_HEMPRICH, Z_CAPRICOR) {
if (C_SERRULAT=="Yes") {
return("C_SERRULAT")
} else if (S_ISOETIFO=="Yes") {
return("S_ISOETIFO")
} else if (T_HEMPRICH =="Yes") {
return("T_HEMPRICH")
} else if (Z_CAPRICOR =="Yes") {
return("Z_CAPIRCOR")
}
}
# build species column to classify species of each observation based on presence/absence
seagrass_data$SPECIES <- mapply(get_species_type,
seagrass_data$C_SERRULAT,
seagrass_data$S_ISOETIFO,
seagrass_data$T_HEMPRICH,
seagrass_data$Z_CAPRICOR)
table(seagrass_data$SPECIES)
##
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## 1256 101 823 10201
Data frame cleanup
# convert SPECIES to unordered factor
seagrass_data$SPECIES <- factor(seagrass_data$SPECIES, ordered=FALSE)
# rename misspelled column in original data set
names(seagrass_data)[names(seagrass_data) == "LATITUTDE"] <- "LATITUDE"
# only select columns relevant for our ML algorithm
seagrass_data <- seagrass_data %>%
select(SPECIES, LATITUDE, LONGITUDE, DEPTH, SEDIMENT, TIDAL)
Write to .csv file.
write.csv(seagrass_data, "cleaned_data/seagrass_classification_data.csv", row.names = FALSE)
In this analysis, we will examine the effects of water temperature on the number of unique fish species observed in the Great Barrier Reef from 1997 to 2011.
Originally, we wanted to examine a relationship between water temperature and coral cover. However, we quickly saw that, when visualized, there does not seem to be a linear relation and we did not want to fit a mis-specified model.
Merge between temperature and coral cover data sets
temp <- read_csv("cleaned_data/aims_temperatures.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## avg_water_temp_2.0m_flat_site = col_double(),
## avg_water_temp_9.0m_slope_site = col_double()
## )
coral_cover <- read_csv("cleaned_data/coral_cover.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## mean_live_coral_cover_percent = col_double(),
## lower_conf_int = col_double(),
## upper_conf_int = col_double(),
## conf_int_span = col_double()
## )
temp_coral_cover <- merge(x=temp, y=coral_cover, by="date")
temp_coral_cover %>%
ggplot(aes(avg_water_temp_2.0m_flat_site, mean_live_coral_cover_percent)) +
geom_point()
temp_coral_cover %>%
ggplot(aes(avg_water_temp_9.0m_slope_site, mean_live_coral_cover_percent)) +
geom_point()
As we can see, there is no linear relationship. While there are 2 or 3 clusters of coral cover percentage values, they seem to be pretty consistent across the range of water temperatures at 2.0m and 9.0 below sea level.
We also suspect that increase in water temperature could also affect diversity in sea life. So, we will examine the relationship between water temperature and the number of unique species of fish observed in the Great Barrier Ree. First, we can do some basic visualization by plotting the relationship between water temperatures at depth 2.0m and 9.0m with the number of unique fish species observedin the Great Barrier Reef.
fish <- read_csv("cleaned_data/fish_species_counts.csv")
## Parsed with column specification:
## cols(
## date = col_date(format = ""),
## num_of_species = col_double()
## )
fish_temp <- merge(x=temp, y=fish, by="date")
# water temp at 2.0m
fish_temp %>% ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) + geom_point()
# water temp at92.0m
fish_temp %>% ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) + geom_point()
There seems to be a bit of negative linear relationship going on, so we will fit a linear model examining the number of unique species discovered in relation to rising temperature.
Now, we can split our data set into train and test sets, using 0.6 to partition our data. Our outcome is the mean coral cover percentage and water temperature is our covariate. We will fit 2 linear regression models: one examining effect of water temperature at 2.0m and the other examining the effect of temperature at 9.0m.
train_index <- createDataPartition(y=fish_temp$num_of_species, times=1, p = 0.6, list=FALSE)
train_set <- fish_temp[train_index, ]
test_set <- fish_temp[-train_index, ]
Fit linear regression model:
fish_temp_2.0m <- lm(num_of_species ~ avg_water_temp_2.0m_flat_site, data=train_set)
summary(fish_temp_2.0m)
##
## Call:
## lm(formula = num_of_species ~ avg_water_temp_2.0m_flat_site,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.879 -8.923 2.119 10.254 39.055
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 117.7762 17.7872 6.621 3.12e-10 ***
## avg_water_temp_2.0m_flat_site -2.0309 0.6461 -3.143 0.00192 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.34 on 203 degrees of freedom
## Multiple R-squared: 0.04641, Adjusted R-squared: 0.04172
## F-statistic: 9.881 on 1 and 203 DF, p-value: 0.00192
fish_temp_9.0m <- lm(num_of_species ~ avg_water_temp_9.0m_slope_site, data=train_set)
summary(fish_temp_9.0m)
##
## Call:
## lm(formula = num_of_species ~ avg_water_temp_9.0m_slope_site,
## data = train_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.966 -8.802 1.979 10.327 39.395
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 119.8436 18.0473 6.641 2.8e-10 ***
## avg_water_temp_9.0m_slope_site -2.1144 0.6582 -3.213 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 203 degrees of freedom
## Multiple R-squared: 0.04838, Adjusted R-squared: 0.0437
## F-statistic: 10.32 on 1 and 203 DF, p-value: 0.00153
We see that the models are very similar in results. The coefficient with covariate 2.0m water temperature and 9.0 water temperature is -2.56 and -2.61, respectively.
Although we should not expect a major difference between how each of the two models performs, let’s compare them anyways to assess which water temperature depth is a better predictor of unique fish species observed in the Great Barrier Reef.
pred_2.0m <- predict(fish_temp_2.0m, test_set)
pred_9.0m <- predict(fish_temp_9.0m, test_set)
postResample(pred = pred_2.0m, obs = test_set$num_of_species)
## RMSE Rsquared MAE
## 14.9712171 0.1295988 11.3846304
postResample(pred = pred_9.0m, obs = test_set$num_of_species)
## RMSE Rsquared MAE
## 14.9392278 0.1351032 11.3419986
As expected, results are very similar.
We can assess this visually to confirm our results.
# water temp at 2.0m
test_set %>%
ggplot(aes(avg_water_temp_2.0m_flat_site, num_of_species)) +
geom_point() +
geom_abline(intercept=fish_temp_2.0m$coefficients[1], slope=fish_temp_2.0m$coefficients[2], col="red")
# water temp at 9.0m
test_set %>%
ggplot(aes(avg_water_temp_9.0m_slope_site, num_of_species)) +
geom_point() +
geom_abline(intercept=fish_temp_9.0m$coefficients[1], slope=fish_temp_9.0m$coefficients[2], col="blue")
They both perform very similarly, and choosing either water temperature as our predictor will yield similar results.
Continuing on from sea life diversity, we have data on presence or absence of certain seagrass species in the Great Barrier Reef from 1999 to 2003. Let’s try to build a classifier to determine how location, and types of sediment and seabed may predict presence of certain sea grass.
As written in the data wrangling and cleaning RMarkdown/HTML file, we chose 4 species to classify: Cymodocea Serrulata, Syringodium Isoetifolium, Thalassia Hemprichii, and Zostera Muelleri (subspecies Capricorni).
seagrass <- read.csv("cleaned_data/seagrass_classification_data.csv", as.is =TRUE)
seagrass$SPECIES <- as.factor(seagrass$SPECIES)
seagrass$SEDIMENT <- as.factor(seagrass$SEDIMENT)
seagrass$TIDAL <- as.factor(seagrass$TIDAL)
head(seagrass)
## SPECIES LATITUDE LONGITUDE DEPTH SEDIMENT TIDAL
## 1 Z_CAPIRCOR -23.65857 151.1206 0 Mud intertidal
## 2 Z_CAPIRCOR -23.65840 151.1212 0 Mud intertidal
## 3 Z_CAPIRCOR -23.65898 151.1203 0 Mud intertidal
## 4 Z_CAPIRCOR -23.65970 151.1197 0 Mud intertidal
## 5 Z_CAPIRCOR -23.65884 151.1205 0 Mud intertidal
## 6 Z_CAPIRCOR -23.65993 151.1197 0 Mud intertidal
summary(seagrass)
## SPECIES LATITUDE LONGITUDE DEPTH
## C_SERRULAT: 1256 Min. :-24.20 Min. :142.5 Min. : 0.0000
## S_ISOETIFO: 101 1st Qu.:-23.79 1st Qu.:146.8 1st Qu.: 0.0000
## T_HEMPRICH: 823 Median :-22.41 Median :150.5 Median : 0.0000
## Z_CAPIRCOR:10201 Mean :-21.06 Mean :149.0 Mean : 0.9253
## 3rd Qu.:-19.17 3rd Qu.:151.3 3rd Qu.: 1.2279
## Max. :-10.75 Max. :151.9 Max. :29.3583
##
## SEDIMENT TIDAL
## Gravel: 1 intertidal:7433
## Mud :8969 subtidal :4948
## Reef : 63
## Rock : 66
## Rubble: 107
## Sand :3151
## Shell : 24
First we plot sea grass according to location (latitude and longitude). Then we will add a third axis (depth) to visualize this in 3-dimensions using plotly package. Since depth is measured in meters below sea level, we visualize this in negative values.
seagrass %>% ggplot() + geom_point(aes(x=LATITUDE, y=LONGITUDE, color=SPECIES))
plot_ly(seagrass, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
We can also see how our categorical predictors relate to sea grass species.
seagrass %>%
ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
seagrass %>%
ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
We will first use random forest to build a classifier and then use a multinomial logistic regression model, and compare the two.
Let’s first partition our data set into train and test sets. Since we have a lot more data here than in the linear regression model, we will partition it by 75%-25%.
# test-train split
seagrass_train_ind <- createDataPartition(y = seagrass$SPECIES, p=0.75, list=FALSE)
train_set <- seagrass[seagrass_train_ind, ]
test_set <- seagrass[-seagrass_train_ind, ]
rf_fit <- randomForest(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL,
data=train_set,
mtry = 2)
rf_fit
##
## Call:
## randomForest(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT + TIDAL, data = train_set, mtry = 2)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 4.71%
## Confusion matrix:
## C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR class.error
## C_SERRULAT 705 22 10 205 0.25159236
## S_ISOETIFO 31 31 4 10 0.59210526
## T_HEMPRICH 43 2 561 12 0.09223301
## Z_CAPIRCOR 93 2 3 7553 0.01280878
rf_pred <- predict(rf_fit, newdata = test_set, type = "response")
confusionMatrix(table(pred = rf_pred, true = test_set$SPECIES))
## Confusion Matrix and Statistics
##
## true
## pred C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## C_SERRULAT 243 9 16 24
## S_ISOETIFO 4 8 1 1
## T_HEMPRICH 3 3 184 2
## Z_CAPIRCOR 64 5 4 2523
##
## Overall Statistics
##
## Accuracy : 0.956
## 95% CI : (0.9482, 0.963)
## No Information Rate : 0.8242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8509
##
## Mcnemar's Test P-Value : 9.047e-06
##
## Statistics by Class:
##
## Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity 0.77389 0.320000 0.89756
## Specificity 0.98237 0.998045 0.99723
## Pos Pred Value 0.83219 0.571429 0.95833
## Neg Pred Value 0.97466 0.994481 0.99276
## Prevalence 0.10149 0.008080 0.06626
## Detection Rate 0.07854 0.002586 0.05947
## Detection Prevalence 0.09438 0.004525 0.06206
## Balanced Accuracy 0.87813 0.659022 0.94740
## Class: Z_CAPIRCOR
## Sensitivity 0.9894
## Specificity 0.8658
## Pos Pred Value 0.9719
## Neg Pred Value 0.9458
## Prevalence 0.8242
## Detection Rate 0.8154
## Detection Prevalence 0.8390
## Balanced Accuracy 0.9276
We see that our classification model works quite well, especially for T_HEMPRICH and Z_CAPRICOR, which have 85%+ sensitivity and specificity. However, we got quite a low sensitivity for S_ISOETIFO. Recall to our data wrangling portion that S_ISOETIFO had only about 100 “Yes” observations. Since we had a small sample size for S_ISOETIFO relative to the other 3 seagrass species, this may have contributed to the low sensitivity.
We can visually assess our predicted values with true values of species to see how our model performed.
# true values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
# predicted values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~rf_pred, type="scatter3d", mode="markers")
For sediment type:
test_set %>%
ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
test_set %>%
ggplot(aes(SEDIMENT, fill=rf_pred)) + geom_bar(width=.5, position = "dodge")
For seabed type:
test_set %>%
ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
test_set %>%
ggplot(aes(TIDAL, fill=rf_pred)) + geom_bar(width=.5, position = "dodge")
Let’s examine variable importance.
variable_importance <- importance(rf_fit)
tmp <- data_frame(feature = rownames(variable_importance),
Gini = variable_importance[,1]) %>%
arrange(desc(Gini))
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
tmp
## # A tibble: 5 x 2
## feature Gini
## <chr> <dbl>
## 1 LATITUDE 931.
## 2 LONGITUDE 878.
## 3 DEPTH 616.
## 4 TIDAL 115.
## 5 SEDIMENT 86.0
We see that longitude and latitude were very predictive of presence of seagrass followed by depth from sea level. The types of sediment and seabed (intertidal or subtidal seabed) are not very good predictors. Thus, it seems that the location of where the sea grass was discovered matters more than the various ocean floor properties.
tmp %>% ggplot(aes(x=reorder(feature, Gini), y=Gini)) +
geom_bar(stat='identity') +
coord_flip() + xlab("Feature") +
theme(axis.text=element_text(size=8))
We can now try a multinomial logistic regression model to see how it compares to random forest. We will use the nnet package.
The logistic regression model is as follows:
multinom_fit <- multinom(SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT, data=train_set)
## # weights: 44 (30 variable)
## initial value 12874.515732
## iter 10 value 3313.265166
## iter 20 value 2723.852573
## iter 30 value 2565.896265
## iter 40 value 2503.556100
## iter 50 value 2474.035234
## iter 60 value 2451.155274
## iter 70 value 2451.106035
## iter 80 value 2451.001366
## iter 90 value 2449.174920
## iter 100 value 2448.479063
## final value 2448.479063
## stopped after 100 iterations
summary(multinom_fit)
## Call:
## multinom(formula = SPECIES ~ LATITUDE + LONGITUDE + DEPTH + SEDIMENT,
## data = train_set)
##
## Coefficients:
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud
## S_ISOETIFO -385.0541 2.1687827 2.778821 0.001477874 13.546418
## T_HEMPRICH -160.9830 1.8802818 1.264179 -0.311850477 6.952428
## Z_CAPIRCOR -301.5464 0.7094671 1.489861 -0.707840969 98.630442
## SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO -56.575700 14.380473 -29.68991 14.66912 15.43732
## T_HEMPRICH 9.809814 9.855825 11.58179 10.12414 10.17451
## Z_CAPIRCOR 20.151148 95.419866 94.53164 97.80779 97.59584
##
## Std. Errors:
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud
## S_ISOETIFO 0.002841091 0.05770350 0.007860097 0.02442800 0.4042663
## T_HEMPRICH 0.001276092 0.05726438 0.007293784 0.04166785 0.3294196
## Z_CAPIRCOR 0.002092067 0.02798120 0.004569543 0.02699023 0.3573032
## SEDIMENTReef SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 5.515254e-14 0.6795060 4.212678e-15 0.3748700 0.8839106
## T_HEMPRICH 5.580735e-01 0.6413926 4.695396e-01 0.3014330 1.1506238
## Z_CAPIRCOR 1.662192e-33 0.6603028 1.026883e+00 0.3586384 0.8662218
##
## Residual Deviance: 4896.958
## AIC: 4956.958
Relative risk ratios where reference group is C_SERRULAT
exp(coef(multinom_fit))
## (Intercept) LATITUDE LONGITUDE DEPTH SEDIMENTMud SEDIMENTReef
## S_ISOETIFO 5.931185e-168 8.747629 16.100021 1.0014790 7.640729e+05 2.688350e-25
## T_HEMPRICH 1.218886e-70 6.555352 3.540184 0.7320910 1.045685e+03 1.821160e+04
## Z_CAPIRCOR 1.096588e-131 2.032908 4.436480 0.4927068 6.833715e+42 5.643291e+08
## SEDIMENTRock SEDIMENTRubble SEDIMENTSand SEDIMENTShell
## S_ISOETIFO 1.759381e+06 1.275954e-13 2.348111e+06 5.062256e+06
## T_HEMPRICH 1.906910e+04 1.071295e+05 2.493792e+04 2.622622e+04
## Z_CAPIRCOR 2.756268e+41 1.133883e+41 3.001819e+42 2.428484e+42
# predicted probabilities
predicted_prob <- predict(multinom_fit, newdata=test_set, type="probs")
# predicted classes
predicted_class <- predict(multinom_fit, newdata=test_set, type="class")
confusionMatrix(data = predicted_class, reference = test_set$SPECIES )
## Confusion Matrix and Statistics
##
## Reference
## Prediction C_SERRULAT S_ISOETIFO T_HEMPRICH Z_CAPIRCOR
## C_SERRULAT 125 9 6 15
## S_ISOETIFO 0 0 0 0
## T_HEMPRICH 16 2 177 20
## Z_CAPIRCOR 173 14 22 2515
##
## Overall Statistics
##
## Accuracy : 0.9105
## 95% CI : (0.8999, 0.9203)
## No Information Rate : 0.8242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6618
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: C_SERRULAT Class: S_ISOETIFO Class: T_HEMPRICH
## Sensitivity 0.3981 0.00000 0.86341
## Specificity 0.9892 1.00000 0.98685
## Pos Pred Value 0.8065 NaN 0.82326
## Neg Pred Value 0.9357 0.99192 0.99027
## Prevalence 0.1015 0.00808 0.06626
## Detection Rate 0.0404 0.00000 0.05721
## Detection Prevalence 0.0501 0.00000 0.06949
## Balanced Accuracy 0.6936 0.50000 0.92513
## Class: Z_CAPIRCOR
## Sensitivity 0.9863
## Specificity 0.6158
## Pos Pred Value 0.9233
## Neg Pred Value 0.9054
## Prevalence 0.8242
## Detection Rate 0.8129
## Detection Prevalence 0.8804
## Balanced Accuracy 0.8010
We see that our multinomial logistic model has about 91% overall accuracy, which performs a bit worse than random forest. However, the model performs very badly in predicting for S_ISOETIFO.
The model seems to predict T_HEMPRICH the best with 88.7% sensitivity and 98.8% specificity. The model also does not perform well for sensitivity of C_SERRULAT, with only about 41.7% sensitivity.
Again, we can assess our results visually:
# true values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~SPECIES, type="scatter3d", mode="markers")
# predicted values
plot_ly(test_set, x=~LATITUDE, y=~LONGITUDE, z=~-DEPTH, color=~predicted_class, type="scatter3d", mode="markers")
For sediment type:
test_set %>%
ggplot(aes(SEDIMENT, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
test_set %>%
ggplot(aes(SEDIMENT, fill=predicted_class)) + geom_bar(width=.5, position = "dodge")
For seabed type:
test_set %>%
ggplot(aes(TIDAL, fill=SPECIES)) + geom_bar(width=.5, position = "dodge")
test_set %>%
ggplot(aes(TIDAL, fill=predicted_class)) + geom_bar(width=.5, position = "dodge")
We can confirm in our visualizations that the logistic regression model failed to predict for S_ISOETIFO.
So, we see that the overall accuracy for multinomial logistic regression model vs random forest model was 95.6% and 90.9%, respectively. However, the overall accuracy stated for the logistic regression model is deceiving since it did not perform well in sensitivity in 2 out of 4 classes.